Chapter 3 Data quality

load("data/data.Rdata")

3.1 Load statistics

read_stats <- read_tsv("data/reports/reads_multiqc_general_stats.txt", 
                        col_types = cols_only("Sample" = col_character(),
                                              "FastQC_mqc-generalstats-fastqc-total_sequences" = col_double(),
                                              "FastQC_mqc-generalstats-fastqc-percent_gc" = col_double(),
                                              "FastQC_mqc-generalstats-fastqc-percent_duplicates" = col_double())) %>%
  mutate(Sample = str_extract(Sample, "M\\d+")) %>%
  rename(microsample=Sample,
         total_sequences="FastQC_mqc-generalstats-fastqc-total_sequences",
         percent_gc="FastQC_mqc-generalstats-fastqc-percent_gc",
         percent_dup="FastQC_mqc-generalstats-fastqc-percent_duplicates") %>%
  group_by(microsample) %>%
  summarise(total_sequences=sum(total_sequences), percent_unique=100-mean(percent_dup), percent_gc=mean(percent_gc))
host_mapping_stats <- read_tsv("data/reports/preprocess_samtools-flagstat-dp_Percentage_of_total.txt") %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="chicken") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample,reads_mapped_host_percent=`Properly Paired`) %>%
    select(microsample,reads_mapped_host_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_host_percent=mean(reads_mapped_host_percent))
human_mapping_stats <- read_tsv("data/reports/preprocess_samtools-flagstat-dp_Percentage_of_total.txt") %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="human") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample,reads_mapped_human_percent=`Properly Paired`) %>%
    select(microsample,reads_mapped_human_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_human_percent=mean(reads_mapped_human_percent))
quantification_stats  <- read_tsv("data/reports/quantify_multiqc_samtools_stats.txt") %>%
   filter(str_detect(Sample, "mgg-pbdrep")) %>%
   mutate(Sample = str_extract(Sample, "M\\d+")) %>%
   rename(microsample=Sample) %>%
    group_by(microsample) %>%
    summarise(reads_mapped=sum(reads_mapped),reads_mapped_percent=mean(reads_mapped_percent))
quality_stats <- read_stats %>%
    left_join(host_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(human_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(quantification_stats, by=join_by(microsample==microsample))

3.2 Individual overview

3.2.1 Sequencing depth

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
   mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=total_sequences,y=microsample))+
      geom_col()+
      geom_vline(xintercept=10000000, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Number of reads", y="Microsamples", fill="Library protocol")

3.2.2 Sequence duplication

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=percent_unique,y=microsample))+
      geom_col()+
      geom_vline(xintercept=35, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of non-duplicates", y="Microsamples", fill="Collection success")

3.2.3 GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=percent_gc,y=microsample))+
      geom_col()+
      geom_vline(xintercept=60, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of GC", y="Microsamples", fill="Library protocol")

3.2.4 Host %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=reads_mapped_host_percent,y=microsample))+
      geom_col()+
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Host %", y="Microsamples", fill="Library protocol")

3.2.5 Human %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=reads_mapped_human_percent,y=microsample))+
      geom_col()+
      geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Human %", y="Microsamples", fill="Library protocol")

3.2.6 Bacteria mapping %

quality_stats %>%
    left_join(sample_metadata,by="microsample") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=reads_mapped_percent,y=microsample))+
      geom_col()+
      geom_vline(xintercept=75, linetype="dashed", color = "red", size=1) + 
      facet_nested(animal + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Mapped to MAGs (%)", y="Microsamples", fill="Library protocol")

3.2.7 Domain-adjusted mapping rate

3.3 Biplots

3.3.1 Sequencing depth vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=percent_gc,y=total_sequences))+
      geom_point()+
      facet_nested(. ~ batch, scales="free") +
      labs(color="Sexrion")

3.3.2 Unique sequences vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=percent_gc,y=percent_unique))+
      geom_point()+
      facet_nested(. ~ batch, scales="free")+
      labs(color="Library protocol")

3.4 Quality flagging

quality <- quality_stats %>%
    mutate(depth = case_when(
        total_sequences <= 10000000 ~ 0,
        total_sequences > 10000000 ~ 1,
        TRUE ~ NA)) %>%
    mutate(duplicates = case_when(
        percent_unique <= 35 ~ 0,
        percent_unique > 35 ~ 1,
        TRUE ~ NA)) %>%
    mutate(gc = case_when(
        percent_gc >= 60 ~ 0,
        percent_gc < 60 ~ 1,
        TRUE ~ NA)) %>%
    mutate(human = case_when(
        reads_mapped_human_percent >= 5 ~ 0,
        reads_mapped_human_percent < 5 ~ 1,
        TRUE ~ NA)) %>%
    mutate(bacteria = case_when(
        reads_mapped_percent <= 75 ~ 0,
        reads_mapped_percent > 75 ~ 1,
        TRUE ~ NA)) %>%
    select(microsample, depth, duplicates, gc, human, bacteria) %>%
    mutate(quality = depth + duplicates + gc + human + bacteria) %>%
    select(microsample, quality)

quality %>% write_tsv("results/quality.tsv")

3.4.1 Quality overview

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    mutate(animal=substr(cryosection,1,4)) %>%
    ggplot(aes(x=quality,y=microsample))+
      geom_col()+
      geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Quality score", y="Microsamples", fill="Microsample area")

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    group_by(section) %>%
    summarise(average=mean(quality, na.rm=TRUE), percentage_5 = mean(quality == 5, na.rm = TRUE) * 100) %>%
    tt()
tinytable_qajq4gf8nvjlc4pvziob
section average percentage_5
Caecum right 4.166667 53.33333